rm(list=ls())
library(ezids)
library(ggplot2)
library(dplyr)
library(readr)
library(tidyverse)
library (tidyr)
library(janitor)
library(scales)
library(ggrepel)
library(corrplot)
This project uses the State and County Housing Market Indicators dataset from the American Enterprise Institute Housing Center, found here. The variables are:
| Original Variable Name | New Variable Name | Definition |
|---|---|---|
| State | State | state |
| County | County_Name | County |
| FIPS | FIPS_County_Code | 5-digit Federal Information Processing Series codes (first 2 digits indicate state, last 3 indicate sub-county entity) |
| Year | Year | Year when the data was collected |
| Tier | Affordability | Categorizes home sales into entry-level (<=80th percentile of FHA sales prices), move-up (all others), and all |
| Median.Sale.Price..in.Thousands. | Median_Sale_Price_in_k | Median sale price in thousands of USD per county |
| House.Price.Appreciation.since.2012 | House_Price_Appreciation_since_2012_percent | Cumulative home price appreciation since 2012 |
| House.Price.Appreciation..Year.over.Year | House_Price_Appreciation_yr_over_yr_percent | Home price appreciation since the previous year |
| Months..Supply | Months_Supply | Number of months it would take for the inventory of existing homes for sale to be exhausted at the current sales pace |
| New.Construction.Share.of.Sales | New_Constr_by_share_of_sales_percent | Percent of sales comprising new construction |
| Mortgage.Default.Rate | Mortgage_Default_Rate_percent | AEI Mortgage Default Rate, a measure of how loans originating in a given month would perform under the same conditions as the 2007 financial crisis (<=7%: Low Risk; between 7.01% and 14%: Medium Risk; >14%: High Risk) |
housing = read.csv("/Users/ilgazkuscu/Documents/GitHub/housing-price-vs-supply-2024/Data/state_county_data_download_2025.csv")
housing %>% slice_sample(n=5)
## State County FIPS Year Tier Median.Sale.Price..in.Thousands.
## 1 GA Jasper County 13159 2023 all $234
## 2 NC Hyde County 37095 2017 all $157
## 3 ID Canyon County 16027 2023 all $394
## 4 PA Mckean County 42083 2019 all $65
## 5 TX Ward County 48475 2018 entrylevel $108
## House.Price.Appreciation.since.2012 House.Price.Appreciation..Year.over.Year.
## 1 188.40% 2.10%
## 2 34.30% 14.80%
## 3 253.00% 1.90%
## 4 19.50% 6.00%
## 5 39.80% 9.20%
## Months..Supply New.Construction.Share.of.Sales Mortgage.Default.Rate
## 1 4.7 21.40% 19.40%
## 2 6.5 3.30% 9.00%
## 3 5.5 36.10% 16.20%
## 4 6.4 3.10% 17.70%
## 5 1.4 4.50% 25.50%
The data is limited to the year 2024 and cleaned of NA values, and the variables are renamed for clarity.
housing_2024 = housing %>% filter(housing$Year == 2024,
housing$State != 'AA National',
housing$County != 'AA State') %>% na.omit %>%
#excluding the armed forces
#rename cols
rename(
Median_Sale_Price_per_k = Median.Sale.Price..in.Thousands.,
House_Price_Appreciation_yr_over_yr_percent = House.Price.Appreciation..Year.over.Year.,
House_Price_Appreciation_since_2012_percent = House.Price.Appreciation.since.2012,
Months_Supply = Months..Supply,
New_Constr_by_share_of_sales_percent = New.Construction.Share.of.Sales,
Mortgage_Default_Rate_percent = Mortgage.Default.Rate,
County_Name = County,
FIPS_County_Code = FIPS,
Affordability = Tier
)
head(housing_2024)
## State County_Name FIPS_County_Code Year Affordability
## 4 AK Aleutians West Census Area 2016 2024 all
## 5 AK Aleutians West Census Area 2016 2024 entrylevel
## 6 AK Aleutians West Census Area 2016 2024 moveup
## 7 AK Anchorage Municipality 2020 2024 all
## 8 AK Anchorage Municipality 2020 2024 entrylevel
## 9 AK Anchorage Municipality 2020 2024 moveup
## Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 4
## 5
## 6
## 7 $415 51.90%
## 8 $325 49.70%
## 9 $581 54.20%
## House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 4 1.8
## 5 2.8
## 6 1.6
## 7 4.50% 2.5
## 8 4.00% 2.2
## 9 5.00% 2.8
## New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 4
## 5
## 6
## 7 4.40% 12.40%
## 8 1.90% 13.60%
## 9 7.90% 9.80%
The typing of the variables is also corrected. Some require the symbols “$” and “%” to be removed beforehand, so that is also done.
# as factors
housing_2024$State = as.factor(housing_2024$State)
housing_2024$County_Name = as.factor(housing_2024$County_Name)
housing_2024$FIPS_County_Code = as.factor(housing_2024$FIPS_County_Code)
housing_2024$Affordability = as.factor(housing_2024$Affordability)
# remove prefixes '$' and '%' from values
housing_2024 = housing_2024 %>%
mutate(Median_Sale_Price_per_k = gsub("\\$", "", Median_Sale_Price_per_k),
House_Price_Appreciation_since_2012_percent =
gsub("%","",House_Price_Appreciation_since_2012_percent),
House_Price_Appreciation_yr_over_yr_percent =
gsub("%","",House_Price_Appreciation_yr_over_yr_percent),
New_Constr_by_share_of_sales_percent = gsub("%","",New_Constr_by_share_of_sales_percent),
Mortgage_Default_Rate_percent = gsub("%","",Mortgage_Default_Rate_percent)
)
# as num instead of chr
housing_2024$Median_Sale_Price_per_k = as.numeric(housing_2024$Median_Sale_Price_per_k)
housing_2024$House_Price_Appreciation_since_2012_percent =
as.numeric(housing_2024$House_Price_Appreciation_since_2012_percent)
housing_2024$House_Price_Appreciation_yr_over_yr_percent =
as.numeric(housing_2024$House_Price_Appreciation_yr_over_yr_percent)
housing_2024$New_Constr_by_share_of_sales_percent =
as.numeric(housing_2024$New_Constr_by_share_of_sales_percent)
housing_2024$Mortgage_Default_Rate_percent = as.numeric(housing_2024$Mortgage_Default_Rate_percent)
# For some reason is rounding the data in quite a weird way—inaccurately
# view data
housing_2024 %>% slice_sample(n=5)
## State County_Name FIPS_County_Code Year Affordability
## 1 KS Seward County 20175 2024 entrylevel
## 2 AR Clark County 5019 2024 moveup
## 3 IL Putnam County 17155 2024 all
## 4 KY Nicholas County 21181 2024 entrylevel
## 5 IL Cass County 17017 2024 moveup
## Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 1 NA NA
## 2 325 58.9
## 3 NA NA
## 4 94 98.4
## 5 233 NA
## House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 1 NA 3.4
## 2 2.3 7.9
## 3 NA 4.7
## 4 -1.5 1.9
## 5 NA 3.3
## New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 1 NA NA
## 2 10.3 12.9
## 3 NA NA
## 4 0.0 NA
## 5 0.0 NA
#creating df w/ affordability all totals and then another with entry level/moving up totals
housing_2024_all = housing_2024 %>% filter(housing_2024$Affordability == "all")
housing_2024_tiers = housing_2024 %>% filter(Affordability == "entrylevel" | Affordability == "moveup")
xkablesummary(housing_2024)
| State | County_Name | FIPS_County_Code | Year | Affordability | Median_Sale_Price_per_k | House_Price_Appreciation_since_2012_percent | House_Price_Appreciation_yr_over_yr_percent | Months_Supply | New_Constr_by_share_of_sales_percent | Mortgage_Default_Rate_percent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | TX : 730 | Washington County: 87 | 1001 : 3 | Min. :2024 | all :3046 | Min. : 14 | Min. : 3.7 | Min. :-32.90 | Min. : 0.00 | Min. : 0.0 | Min. : 0.2 |
| Q1 | GA : 477 | Jefferson County : 74 | 1003 : 3 | 1st Qu.:2024 | entrylevel:3043 | 1st Qu.:148 | 1st Qu.: 86.6 | 1st Qu.: 2.80 | 1st Qu.: 2.20 | 1st Qu.: 2.7 | 1st Qu.:10.2 |
| Median | KY : 360 | Franklin County : 72 | 1005 : 3 | Median :2024 | moveup :2996 | Median :255 | Median :105.2 | Median : 5.70 | Median : 3.20 | Median : 6.8 | Median :13.9 |
| Mean | MO : 334 | Jackson County : 68 | 1007 : 3 | Mean :2024 | NA | Mean :284 | Mean :110.7 | Mean : 5.88 | Mean : 4.37 | Mean : 10.7 | Mean :14.3 |
| Q3 | VA : 323 | Lincoln County : 66 | 1009 : 3 | 3rd Qu.:2024 | NA | 3rd Qu.:389 | 3rd Qu.:129.6 | 3rd Qu.: 8.70 | 3rd Qu.: 5.20 | 3rd Qu.: 14.9 | 3rd Qu.:17.5 |
| Max | IL : 304 | Madison County : 57 | 1011 : 3 | Max. :2024 | NA | Max. :999 | Max. :279.1 | Max. : 89.20 | Max. :24.00 | Max. :100.0 | Max. :36.0 |
| NA | (Other):6557 | (Other) :8661 | (Other):9067 | NA | NA | NA’s :990 | NA’s :2177 | NA’s :2204 | NA | NA’s :859 | NA’s :1960 |
#separated df's
xkablesummary(housing_2024_all)
| State | County_Name | FIPS_County_Code | Year | Affordability | Median_Sale_Price_per_k | House_Price_Appreciation_since_2012_percent | House_Price_Appreciation_yr_over_yr_percent | Months_Supply | New_Constr_by_share_of_sales_percent | Mortgage_Default_Rate_percent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | TX : 249 | Washington County: 29 | 1001 : 1 | Min. :2024 | all :3046 | Min. : 17 | Min. : 3.7 | Min. :-28.70 | Min. : 0.10 | Min. : 0.0 | Min. : 0.5 |
| Q1 | GA : 159 | Jefferson County : 25 | 1003 : 1 | 1st Qu.:2024 | entrylevel: 0 | 1st Qu.:140 | 1st Qu.: 88.9 | 1st Qu.: 3.00 | 1st Qu.: 2.30 | 1st Qu.: 2.9 | 1st Qu.:11.4 |
| Median | KY : 120 | Franklin County : 24 | 1005 : 1 | Median :2024 | moveup : 0 | Median :199 | Median :107.3 | Median : 5.95 | Median : 3.10 | Median : 6.4 | Median :14.5 |
| Mean | MO : 112 | Jackson County : 23 | 1007 : 1 | Mean :2024 | NA | Mean :236 | Mean :113.4 | Mean : 6.00 | Mean : 3.63 | Mean : 9.5 | Mean :14.8 |
| Q3 | VA : 108 | Lincoln County : 22 | 1009 : 1 | 3rd Qu.:2024 | NA | 3rd Qu.:300 | 3rd Qu.:132.6 | 3rd Qu.: 8.60 | 3rd Qu.: 4.30 | 3rd Qu.: 12.5 | 3rd Qu.:17.6 |
| Max | IL : 102 | Madison County : 19 | 1011 : 1 | Max. :2024 | NA | Max. :998 | Max. :278.0 | Max. : 89.20 | Max. :24.00 | Max. :100.0 | Max. :36.0 |
| NA | (Other):2196 | (Other) :2904 | (Other):3040 | NA | NA | NA’s :309 | NA’s :588 | NA’s :596 | NA | NA’s :276 | NA’s :618 |
xkablesummary(housing_2024_tiers)
| State | County_Name | FIPS_County_Code | Year | Affordability | Median_Sale_Price_per_k | House_Price_Appreciation_since_2012_percent | House_Price_Appreciation_yr_over_yr_percent | Months_Supply | New_Constr_by_share_of_sales_percent | Mortgage_Default_Rate_percent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | TX : 481 | Washington County: 58 | 1001 : 2 | Min. :2024 | all : 0 | Min. : 14 | Min. : 4.5 | Min. :-32.90 | Min. : 0.00 | Min. : 0.0 | Min. : 0.2 |
| Q1 | GA : 318 | Jefferson County : 49 | 1003 : 2 | 1st Qu.:2024 | entrylevel:3043 | 1st Qu.:153 | 1st Qu.: 85.4 | 1st Qu.: 2.70 | 1st Qu.: 2.20 | 1st Qu.: 2.5 | 1st Qu.: 9.7 |
| Median | KY : 240 | Franklin County : 48 | 1005 : 2 | Median :2024 | moveup :2996 | Median :310 | Median :104.4 | Median : 5.60 | Median : 3.40 | Median : 7.2 | Median :13.6 |
| Mean | MO : 222 | Jackson County : 45 | 1007 : 2 | Mean :2024 | NA | Mean :308 | Mean :109.2 | Mean : 5.82 | Mean : 4.74 | Mean : 11.3 | Mean :14.0 |
| Q3 | VA : 215 | Lincoln County : 44 | 1009 : 2 | 3rd Qu.:2024 | NA | 3rd Qu.:415 | 3rd Qu.:128.3 | 3rd Qu.: 8.70 | 3rd Qu.: 5.90 | 3rd Qu.: 16.0 | 3rd Qu.:17.5 |
| Max | IL : 202 | Madison County : 38 | 1011 : 2 | Max. :2024 | NA | Max. :999 | Max. :279.1 | Max. : 86.80 | Max. :24.00 | Max. :100.0 | Max. :36.0 |
| NA | (Other):4361 | (Other) :5757 | (Other):6027 | NA | NA | NA’s :681 | NA’s :1589 | NA’s :1608 | NA | NA’s :583 | NA’s :1342 |
ggplot(housing_2024, aes(x = reorder(State, -Median_Sale_Price_per_k, median),
y = Median_Sale_Price_per_k)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) +
coord_flip() +
labs(
title = "Distribution of Median Sale Prices by State (2024)",
x = "State",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
There are too many to be particularly useful. You can see general trends, but I am going to run this with a smaller state sample.
#boxplotting med sale price by state w/ ..._all df
#all
ggplot(housing_2024_all, aes(x = reorder(State, -Median_Sale_Price_per_k, median),
y = Median_Sale_Price_per_k)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) +
coord_flip() +
labs(
title = "Distribution of Median Sale Prices by State (2024)",
x = "State",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
top_states <- housing_2024 %>%
dplyr::count(State, sort = TRUE) %>%
dplyr::slice_max(n, n = 5)
bottom_states <- housing_2024 %>%
dplyr::count(State, sort = TRUE) %>%
dplyr::slice_min(n, n = 5)
# merge top and bottom states
housing_compare <- housing_2024 %>%
filter(State %in% c(top_states$State, bottom_states$State)) %>%
mutate(StateGroup = case_when(
State %in% top_states$State ~ "States with the Most Houses",
State %in% bottom_states$State ~ "States with the Fewest Houses"
))%>%
mutate(StateGroup = factor(StateGroup, levels = c(
"States with the Most Houses",
"States with the Fewest Houses"
)))
#plot top bottom comparison ####
ggplot(housing_compare, aes(x = State, y = Median_Sale_Price_per_k, fill = StateGroup)) +
geom_boxplot() +
facet_wrap(~ StateGroup, scales = "free_x") +
labs(
title = "Median Sale Price in States with Most vs Least Housing Records",
subtitle = "The Median Sale Price in States with a Larger Supply of Houses is Significantly Lower\nthan States with a Smaller Supply of Houses",
x = "State",
y = "Median Sale Price (in thousands)"
) +
theme_minimal() +
scale_fill_manual(values = c("States with the Most Houses" = "skyblue", "States with the Fewest Houses" = "red"))
#boxplotting with new df: ..._all
top_states_all <- housing_2024_all %>%
dplyr::count(State, sort = TRUE) %>%
dplyr::slice_max(n, n = 5)
bottom_states_all <- housing_2024_all %>%
dplyr::count(State, sort = TRUE) %>%
dplyr::slice_min(n, n = 5)
# merge top and bottom states
housing_compare_all <- housing_2024_all %>%
filter(State %in% c(top_states_all$State, bottom_states_all$State)) %>%
mutate(StateGroup = case_when(
State %in% top_states_all$State ~ "States with the Most Houses",
State %in% bottom_states_all$State ~ "States with the Fewest Houses"
))%>%
mutate(StateGroup = factor(StateGroup, levels = c(
"States with the Most Houses",
"States with the Fewest Houses"
)))
#plot top bottom comparison ####
ggplot(housing_compare_all, aes(x = State, y = Median_Sale_Price_per_k, fill = StateGroup)) +
geom_boxplot() +
facet_wrap(~ StateGroup, scales = "free_x") +
labs(
title = "Median Sale Price in States with Most vs Least Housing Records",
subtitle = "The Median Sale Price in States with a Larger Supply of Houses is Significantly Lower\nthan States with a Smaller Supply of Houses",
x = "State",
y = "Median Sale Price (in thousands)"
) +
theme_minimal() +
scale_fill_manual(values = c("States with the Most Houses" = "skyblue", "States with the Fewest Houses" = "red"))
ggplot(housing_2024, aes(x = Median_Sale_Price_per_k)) +
geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
labs(
title = "Distribution of Median Sale Prices (2024)",
x = "Median Sale Price (in thousands)",
y = "Count"
) +
theme_minimal()
#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me
#hist plotting with df ..._all
ggplot(housing_2024_all, aes(x = Median_Sale_Price_per_k)) +
geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
labs(
title = "Distribution of Median Sale Prices (2024)",
x = "Median Sale Price (in thousands)",
y = "Count"
) +
theme_minimal()
#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me
ggplot(housing_2024, aes(x = Affordability, y = Median_Sale_Price_per_k, fill = Affordability)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Affordability Tier (2024)",
x = "Affordability Tier",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
#no duh, there are more unpurchased expensive houses because people can't afford it
#not sure how useful this is, but maybe as a starting baseline
ggplot(housing_2024_all, aes(x = Months_Supply, y = Median_Sale_Price_per_k)) +
geom_point(color="steelblue", alpha = 0.7) +
# geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Housing Supply vs Median Sale Price",
x = "Months of Supply",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
#this is a good one, needs a commentary subtitle and some cleaning
housing_numeric <- housing_2024 %>%
select(where(is.numeric) & !all_of("Year")) %>%
drop_na()
cor_matrix <- cor(housing_numeric)
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black", number.cex=0.5)
#interesting to see positive and inverse relationships between variables
#did not scan for no relationships
#correlation heatmap of numeric variables w/ ..._all df
housing_numeric_all <- housing_2024_all %>%
select(where(is.numeric) & !all_of("Year")) %>%
drop_na()
cor_matrix_all <- cor(housing_numeric_all)
corrplot(cor_matrix_all, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black", number.cex=0.5)
# Create a vector of colors for top states (blue shades) and bottom states (red shades)
top_states_colors <- scales::seq_gradient_pal("lightblue", "darkslateblue")(seq(0, 1, length.out = length(unique(housing_compare$State[housing_compare$StateGroup == "States with the Most Houses"]))))
names(top_states_colors) <- unique(housing_compare$State[housing_compare$StateGroup == "States with the Most Houses"])
bottom_states_colors <- scales::seq_gradient_pal("lightpink", "darkred")(seq(0, 1, length.out = length(unique(housing_compare$State[housing_compare$StateGroup == "States with the Fewest Houses"]))))
names(bottom_states_colors) <- unique(housing_compare$State[housing_compare$StateGroup == "States with the Fewest Houses"])
#last point in line for state label
label_points <- housing_compare %>%
group_by(State) %>%
filter(Months_Supply == max(Months_Supply, na.rm = TRUE)) %>%
ungroup()
# Combine into one color vector
state_colors <- c(top_states_colors, bottom_states_colors)
#all in gray with faceted compare in color with states labeled ####
ggplot() +
geom_point(data = housing_2024, aes(x = Months_Supply, y = Median_Sale_Price_per_k),
color = "gray70", alpha = 0.3, size = 1) +
# geom_path(data = housing_compare,
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# size = 1, alpha = 0.8) +
geom_point(data = housing_compare,
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
size = 2, alpha = 0.5) +
geom_text_repel(data = label_points,
aes(x = Months_Supply, y = Median_Sale_Price_per_k,
label = State, color=State),
size = 3.5, stroke=0.01, show.legend = FALSE) +
facet_wrap(~ StateGroup) +
scale_color_manual(values = state_colors) +
labs(
title = "Housing Supply vs Median Price by State with Grouped Colors",
subtitle = "Some commentary here",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
color = "State"
) +
theme_minimal()
#can't read state names, maybe find a way to make it stand out
# compare color scale w/ ..._all df
# Create a vector of colors for top states (blue shades) and bottom states (red shades)
top_states_colors_all <- scales::seq_gradient_pal("lightblue", "darkslateblue")(seq(0, 1, length.out = length(unique(housing_compare_all$State[housing_compare_all$StateGroup == "States with the Most Houses"]))))
names(top_states_colors_all) <- unique(housing_compare_all$State[housing_compare_all$StateGroup == "States with the Most Houses"])
bottom_states_colors_all <- scales::seq_gradient_pal("lightpink", "darkred")(seq(0, 1, length.out = length(unique(housing_compare_all$State[housing_compare_all$StateGroup == "States with the Fewest Houses"]))))
names(bottom_states_colors_all) <- unique(housing_compare_all$State[housing_compare_all$StateGroup == "States with the Fewest Houses"])
#last point in line for state label
label_points_all <- housing_compare_all %>%
group_by(State) %>%
filter(Months_Supply == max(Months_Supply, na.rm = TRUE)) %>%
ungroup()
# Combine into one color vector
state_colors_all <- c(top_states_colors_all, bottom_states_colors_all)
#all in gray with faceted compare in color with states labeled ####
ggplot() +
geom_point(data = housing_2024_all, aes(x = Months_Supply, y = Median_Sale_Price_per_k),
color = "gray70", alpha = 0.3, size = 1) +
# geom_path(data = housing_compare,
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# size = 1, alpha = 0.8) +
geom_point(data = housing_compare_all,
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
size = 2, alpha = 0.5) +
geom_text_repel(data = label_points_all,
aes(x = Months_Supply, y = Median_Sale_Price_per_k,
label = State, color=State),
size = 3.5, stroke=0.01, show.legend = FALSE) +
facet_wrap(~ StateGroup) +
scale_color_manual(values = state_colors_all) +
labs(
title = "Housing Supply vs Median Price by State with Grouped Colors",
subtitle = "Some commentary here",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
color = "State"
) +
theme_minimal()
#can't read state names, maybe find a way to make it stand out
ggplot() +
geom_point(data = housing_2024,
aes(x = Months_Supply, y = Median_Sale_Price_per_k),
color = "gray70", alpha = 0.3, size = 1) +
# geom_path(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# alpha = 0.8, size = 1) +
geom_point(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
alpha = 0.5, size = 2) +
# geom_path(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# alpha = 0.8, size = 1) +
geom_point(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
alpha = 0.5, size = 2) +
scale_color_manual(values = state_colors) +
labs(
title = "Housing Supply vs Median Price: All Counties with Highlights",
subtitle = "Gray points: All counties | Blue shades: States with Most Houses | Red shades: States with Fewest Houses",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
color = "State"
) +
theme_minimal()
#both together in gray with compare in color with states labeled w/ ..._all df
ggplot() +
geom_point(data = housing_2024_all,
aes(x = Months_Supply, y = Median_Sale_Price_per_k),
color = "gray70", alpha = 0.3, size = 1) +
# geom_path(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# alpha = 0.8, size = 1) +
geom_point(data = housing_compare_all %>% filter(StateGroup == "States with the Most Houses"),
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
alpha = 0.5, size = 2) +
# geom_path(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# alpha = 0.8, size = 1) +
geom_point(data = housing_compare_all %>% filter(StateGroup == "States with the Fewest Houses"),
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
alpha = 0.5, size = 2) +
scale_color_manual(values = state_colors_all) +
labs(
title = "Housing Supply vs Median Price: All Counties with Highlights",
subtitle = "Gray points: All counties | Blue shades: States with Most Houses | Red shades: States with Fewest Houses",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
color = "State"
) +
theme_minimal()
housing_constr = housing_2024
housing_constr = housing_constr %>%
mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))
ggplot(housing_constr, aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?
#new construction vs median sale price w/ ..._all df
housing_constr_all = housing_2024_all
housing_constr_all = housing_constr_all %>%
mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))
ggplot(housing_constr_all, aes(x = constr_bins, y = Median_Sale_Price_per_k, fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?
housing_constr_tiers = housing_2024_tiers
housing_constr_tiers = housing_constr_tiers %>%
mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))
ggplot(filter(housing_constr,Affordability=="entrylevel"), aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
# Even more pronounced here
## Entry-Level Focused Analysis
### Boxplot of Median Sale Price by State (Entry-Level Only)
housing_entrylevel <- housing_2024_tiers %>%
filter(Affordability == "entrylevel")
housing_moveup <- housing_2024_tiers %>%
filter(Affordability == "moveup")
ggplot(housing_entrylevel, aes(x = reorder(State, -Median_Sale_Price_per_k, median),
y = Median_Sale_Price_per_k)) +
geom_boxplot(fill = "skyblue", alpha = 0.7) +
coord_flip() +
labs(
title = "Median Sale Price by State (Entry-Level Homes, 2024)",
x = "State",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
ggplot(housing_entrylevel, aes(x = Median_Sale_Price_per_k)) +
geom_histogram(binwidth = 50, fill = "lightgreen", color = "black") +
scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
labs(
title = "Distribution of Median Sale Prices (Entry-Level Homes, 2024)",
x = "Median Sale Price (in thousands)",
y = "Number of Counties"
) +
theme_minimal()
ggplot(housing_2024_tiers, aes(x = Median_Sale_Price_per_k, fill = Affordability)) +
geom_histogram(
binwidth = 50,
color = "black",
alpha = 0.5,
position = "identity"
) +
scale_fill_manual(
values = c("entrylevel" = "lightgreen", "moveup" = "steelblue"),
name = "Home Type",
labels = c("Entry-Level", "Move-Up")
) +
scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
labs(
title = "Distribution of Median Sale Prices (Entry-Level vs Move-Up, 2024)",
x = "Median Sale Price (in thousands)",
y = "Count"
) +
theme_minimal()
ggplot(housing_entrylevel, aes(x = Months_Supply, y = Median_Sale_Price_per_k)) +
geom_point(color = "tomato", alpha = 0.6) +
labs(
title = "Months of Supply vs Median Sale Price (Entry-Level Homes, 2024)",
x = "Months of Supply",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
housing_numeric <- housing_2024 %>%
select(where(is.numeric), -Year) %>%
drop_na()
cor_matrix <- cor(housing_numeric)
corrplot(cor_matrix, method = "color", type = "upper",
tl.cex = 0.8, addCoef.col = "black", number.cex = 0.5)